package com.aphrodite.util.regex;

import java.util.Vector;

import com.aphrodite.util.CharUtil;
import com.aphrodite.util.CharacterIterator;
import com.aphrodite.util.StringCharacterIterator;

/**
 * 
 * Regular Expression compiler
 * 
 * <p>
 * 
 * <pre>
 * RegEx r = new RegEx(&quot;a*b&quot;);
 * </pre>
 * 
 * Once you have done this, you can call either of the RE.match methods to
 * perform matching on a String. For example:
 * 
 * <pre>
 * boolean matched = r.match(&quot;aaaab&quot;);
 * </pre>
 * 
 * will cause the boolean matched to be set to true because the pattern "a*b"
 * matches the string "aaaab".
 * 
 * <p>
 * If you were interested in the <i>number</i> of a's which matched the first
 * part of our example expression, you could change the expression to "(a*)b".
 * Then when you compiled the expression and matched it against something like
 * "xaaaab", you would get results like this:
 * 
 * <pre>
 * RegEx r = new RegEx(&quot;(a*)b&quot;); // Compile expression
 * boolean matched = r.match(&quot;xaaaab&quot;); // Match against &quot;xaaaab&quot;
 * 
 * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab'
 * String insideParens = r.getParen(1); // insideParens will be 'aaaa'
 * 
 * int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1
 * int endWholeExpr = r.getParenEnd(0); // endWholeExpr will be index 6
 * int lenWholeExpr = r.getParenLength(0); // lenWholeExpr will be 5
 * 
 * int startInside = r.getParenStart(1); // startInside will be index 1
 * int endInside = r.getParenEnd(1); // endInside will be index 5
 * int lenInside = r.getParenLength(1); // lenInside will be 4
 * </pre>
 * 
 * You can also refer to the contents of a parenthesized expression within a
 * regular expression itself. This is called a 'backreference'. The first
 * backreference in a regular expression is denoted by \1, the second by \2 and
 * so on. So the expression:
 * 
 * <pre>
 *  ([0-9]+)=\1
 * </pre>
 * 
 * will match any string of the form n=n (like 0=0 or 2=2).
 * 
 * <p>
 * The full regular expression syntax accepted by RE is described here:
 * 
 * <pre>
 * 
 *  &lt;b&gt;&lt;font face=times roman&gt;Characters&lt;/font&gt;&lt;/b&gt;
 * 
 *    &lt;i&gt;unicodeChar&lt;/i&gt;   Matches any identical unicode character
 *    \                    Used to quote a meta-character (like '*')
 *    \\                   Matches a single '\' character
 *    \0nnn                Matches a given octal character
 *    \xhh                 Matches a given 8-bit hexadecimal character
 *    \\uhhhh              Matches a given 16-bit hexadecimal character
 *    \t                   Matches an ASCII tab character
 *    \n                   Matches an ASCII newline character
 *    \r                   Matches an ASCII return character
 *    \f                   Matches an ASCII form feed character
 * 
 * 
 *  &lt;b&gt;&lt;font face=times roman&gt;Character Classes&lt;/font&gt;&lt;/b&gt;
 * 
 *    [abc]                Simple character class
 *    [a-zA-Z]             Character class with ranges
 *    [&circ;abc]               Negated character class
 * </pre>
 * 
 * <b>NOTE:</b> Incomplete ranges will be interpreted as &quot;starts from
 * zero&quot; or &quot;ends with last character&quot;. <br>
 * I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF],
 * [-] means &quot;all characters&quot;.
 * 
 * <pre>
 * 
 *  &lt;b&gt;&lt;font face=times roman&gt;Standard POSIX Character Classes&lt;/font&gt;&lt;/b&gt;
 * 
 *    [:alnum:]            Alphanumeric characters.
 *    [:alpha:]            Alphabetic characters.
 *    [:blank:]            Space and tab characters.
 *    [:cntrl:]            Control characters.
 *    [:digit:]            Numeric characters.
 *    [:graph:]            Characters that are printable and are also visible.
 *                         (A space is printable, but not visible, while an
 *                         `a' is both.)
 *    [:lower:]            Lower-case alphabetic characters.
 *    [:print:]            Printable characters (characters that are not
 *                         control characters.)
 *    [:punct:]            Punctuation characters (characters that are not letter,
 *                         digits, control characters, or space characters).
 *    [:space:]            Space characters (such as space, tab, and formfeed,
 *                         to name a few).
 *    [:upper:]            Upper-case alphabetic characters.
 *    [:xdigit:]           Characters that are hexadecimal digits.
 * 
 * 
 *  &lt;b&gt;&lt;font face=times roman&gt;Non-standard POSIX-style Character Classes&lt;/font&gt;&lt;/b&gt;
 * 
 *    [:javastart:]        Start of a Java identifier
 *    [:javapart:]         Part of a Java identifier
 * 
 * 
 *  &lt;b&gt;&lt;font face=times roman&gt;Predefined Classes&lt;/font&gt;&lt;/b&gt;
 * 
 *    .         Matches any character other than newline
 *    \w        Matches a &quot;word&quot; character (alphanumeric plus &quot;_&quot;)
 *    \W        Matches a non-word character
 *    \s        Matches a whitespace character
 *    \S        Matches a non-whitespace character
 *    \d        Matches a digit character
 *    \D        Matches a non-digit character
 * 
 * 
 *  &lt;b&gt;&lt;font face=times roman&gt;Boundary Matchers&lt;/font&gt;&lt;/b&gt;
 * 
 *    &circ;         Matches only at the beginning of a line
 *    $         Matches only at the end of a line
 *    \b        Matches only at a word boundary
 *    \B        Matches only at a non-word boundary
 * 
 * 
 *  &lt;b&gt;&lt;font face=times roman&gt;Greedy Closures&lt;/font&gt;&lt;/b&gt;
 * 
 *    A*        Matches A 0 or more times (greedy)
 *    A+        Matches A 1 or more times (greedy)
 *    A?        Matches A 1 or 0 times (greedy)
 *    A{n}      Matches A exactly n times (greedy)
 *    A{n,}     Matches A at least n times (greedy)
 *    A{n,m}    Matches A at least n but not more than m times (greedy)
 * 
 * 
 *  &lt;b&gt;&lt;font face=times roman&gt;Reluctant Closures&lt;/font&gt;&lt;/b&gt;
 * 
 *    A*?       Matches A 0 or more times (reluctant)
 *    A+?       Matches A 1 or more times (reluctant)
 *    A??       Matches A 0 or 1 times (reluctant)
 * 
 * 
 *  &lt;b&gt;&lt;font face=times roman&gt;Logical Operators&lt;/font&gt;&lt;/b&gt;
 * 
 *    AB        Matches A followed by B
 *    A|B       Matches either A or B
 *    (A)       Used for subexpression grouping
 *   (?:A)      Used for subexpression clustering (just like grouping but
 *              no backrefs)
 * 
 * 
 *  &lt;b&gt;&lt;font face=times roman&gt;Backreferences&lt;/font&gt;&lt;/b&gt;
 * 
 *    \1    Backreference to 1st parenthesized subexpression
 *    \2    Backreference to 2nd parenthesized subexpression
 *    \3    Backreference to 3rd parenthesized subexpression
 *    \4    Backreference to 4th parenthesized subexpression
 *    \5    Backreference to 5th parenthesized subexpression
 *    \6    Backreference to 6th parenthesized subexpression
 *    \7    Backreference to 7th parenthesized subexpression
 *    \8    Backreference to 8th parenthesized subexpression
 *    \9    Backreference to 9th parenthesized subexpression
 * </pre>
 * 
 * <p>
 * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning that
 * they match as many elements of the string as possible without causing the
 * overall match to fail. If you want a closure to be reluctant (non-greedy),
 * you can simply follow it with a '?'. A reluctant closure will match as few
 * elements of the string as possible when finding matches. {m,n} closures don't
 * currently support reluctancy.
 * 
 * <p>
 * <b><font face="times roman">Line terminators</font></b> <br>
 * A line terminator is a one- or two-character sequence that marks the end of a
 * line of the input character sequence. The following are recognized as line
 * terminators:
 * <ul>
 * <li>A newline (line feed) character ('\n'),</li>
 * <li>A carriage-return character followed immediately by a newline character
 * ("\r\n"),</li>
 * <li>A standalone carriage-return character ('\r'),</li>
 * <li>A next-line character ('\u0085'),</li>
 * <li>A line-separator character ('\u2028'), or</li>
 * <li>A paragraph-separator character ('\u2029).</li>
 * </ul>
 * 
 * <p>
 * RE runs programs compiled by the RECompiler class. But the RE matcher class
 * does not include the actual regular expression compiler for reasons of
 * efficiency. You can construct a single RECompiler object and re-use it to
 * compile each expression. Similarly, you can change the program run by a given
 * matcher object at any time. However, RE and RECompiler are not threadsafe
 * (for efficiency reasons, and because requiring thread safety in this class is
 * deemed to be a rare requirement), so you will need to construct a separate
 * compiler or matcher object for each thread (unless you do thread
 * synchronization yourself). Once expression compiled into the REProgram
 * object, REProgram can be safely shared across multiple threads and RE
 * objects.
 * 
 * <br>
 * <p>
 * <br>
 * 
 * <font color="red"> <i>ISSUES:</i>
 * 
 * <ul>
 * <li>is not currently compatible with all standard POSIX regcomp flags</li>
 * <li>does not support POSIX equivalence classes ([=foo=] syntax) (I18N/locale
 * issue)</li>
 * <li>does not support nested POSIX character classes (definitely should, but
 * not completely trivial)</li>
 * <li>Does not support POSIX character collation concepts ([.foo.] syntax)
 * (I18N/locale issue)</li>
 * <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
 * <li>Should RegEx support character iterators (for backwards RE matching!)?</li>
 * <li>Should RegEx support reluctant {m,n} closures (does anyone care)?</li>
 * <li>Not *all* possibilities are considered for greediness when backreferences
 * are involved (as POSIX suggests should be the case). The POSIX RE
 * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match of
 * acdacaa where \1 is "a". This is not the case in this RE package, and
 * actually Perl doesn't go to this extent either! Until someone actually
 * complains about this, I'm not sure it's worth "fixing". If it ever is fixed,
 * test #137 in RETest.txt should be updated.</li>
 * </ul>
 * 
 * </font>
 * 
 */
public class RegEx {

	// Escape codes
	static final char E_ALNUM = 'w'; // Alphanumeric
	static final char E_BOUND = 'b'; // Word boundary
	static final char E_DIGIT = 'd'; // Digit
	static final char E_NALNUM = 'W'; // Non-alphanumeric
	static final char E_NBOUND = 'B'; // Non-word boundary
	static final char E_NDIGIT = 'D'; // Non-digit
	static final char E_NSPACE = 'S'; // Non-whitespace
	static final char E_SPACE = 's'; // Whitespace
	/**
	 * Flag to indicate that matching should be case-independent (folded)
	 */
	public static final int MATCH_CASEINDEPENDENT = 0x0001;
	/**
	 * Newlines should match as BOL/EOL (^ and $)
	 */
	public static final int MATCH_MULTILINE = 0x0002;
	/**
	 * Specifies normal, case-sensitive matching behaviour.
	 */
	public static final int MATCH_NORMAL = 0x0000;
	/**
	 * Consider all input a single body of text - newlines are matched by .
	 */
	public static final int MATCH_SINGLELINE = 0x0004;
	static final int MAX_PAREN = 16; // Number of paren pairs (only 9 can be
	// backrefs)
	// Limits
	static final int maxNode = 65536; // Maximum number of nodes in a program
	static final int nodeSize = 3; // Node size (in chars)
	static final int offsetNext = 2; // Next index offset (third char)
	// Node layout constants
	static final int offsetOpcode = 0; // Opcode offset (first character)
	static final int offsetOpdata = 1; // Opdata offset (second char)
	static final char OP_ANY = '.'; // match any single character except newline
	static final char OP_ANYOF = '['; // count/ranges match any char in the list
	// next one
	static final char OP_ATOM = 'A'; // length/string length of string followed
	static final char OP_BACKREF = '#'; // number reference nth already matched
	static final char OP_BOL = '^'; // match only if at beginning of line
	// of ranges
	static final char OP_BRANCH = '|'; // node match this alternative or the
	static final char OP_CLOSE = ')'; // number nth closing paren
	static final char OP_CLOSE_CLUSTER = '>'; // closing cluster
	static final char OP_CONTINUE = 'C'; // continue to the following command

	/************************************************
	 * * The format of a node in a program is: * * [ OPCODE ] [ OPDATA ] [
	 * OPNEXT ] [ OPERAND ] * * char OPCODE - instruction * char OPDATA -
	 * modifying data * char OPNEXT - next node (relative offset) * *
	 ************************************************/

	// Opcode Char Opdata/Operand Meaning
	// ---------- ---------- ---------------
	// --------------------------------------------------
	static final char OP_END = 'E'; // end of program
	static final char OP_EOL = '$'; // match only if at end of line
	static final char OP_ESCAPE = '\\'; // escape special escape code char class
	// parenthesized string
	static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
	static final char OP_MAYBE = '?'; // node optional closure
	static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
	// (escape is E_* code)
	static final char OP_OPEN = '('; // number nth opening paren
	static final char OP_OPEN_CLUSTER = '<'; // opening cluster

	static final char OP_PLUS = '+'; // node positive closure
	// (mnemonic for char is
	// unshifted '?')
	static final char OP_POSIXCLASS = 'P'; // classid one of the posix character
	// classes
	// (mnemonic for char is
	// unshifted '+')
	static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?'
	// (mnemonic for char is
	// unshifted '*')
	static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+'
	// (ignore next)
	static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*'
	// by string itself
	static final char OP_STAR = '*'; // node kleene closure
	// Posix character classes
	static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
	static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
	static final char POSIX_CLASS_BLANK = 'b'; // Blanks
	static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
	static final char POSIX_CLASS_DIGIT = 'd'; // Digits
	static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
	static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
	static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start

	static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
	static final char POSIX_CLASS_PRINT = 'p'; // Printable characters

	static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
	static final char POSIX_CLASS_SPACE = 's'; // Spaces
	static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
	static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits

	/**
	 * Flag bit that indicates that subst should replace all occurrences of this
	 * regular expression.
	 */
	public static final int REPLACE_ALL = 0x0000;
	/**
	 * Flag bit that indicates that subst should replace backreferences
	 */
	public static final int REPLACE_BACKREFERENCES = 0x0002;
	/**
	 * Flag bit that indicates that subst should only replace the first
	 * occurrence of this regular expression.
	 */
	public static final int REPLACE_FIRSTONLY = 0x0001;

	/**
	 * Converts a 'simplified' regular expression to a full regular expression
	 * 
	 * @param pattern
	 *            The pattern to convert
	 * @return The full regular expression
	 */
	public static String simplePatternToFullRegularExpression(String pattern) {
		StringBuffer buf = new StringBuffer();
		for (int i = 0; i < pattern.length(); i++) {
			char c = pattern.charAt(i);
			switch (c) {
			case '*':
				buf.append(".*");
				break;

			case '.':
			case '[':
			case ']':
			case '\\':
			case '+':
			case '?':
			case '{':
			case '}':
			case '$':
			case '^':
			case '|':
			case '(':
			case ')':
				buf.append('\\');
			default:
				buf.append(c);
				break;
			}
		}
		return buf.toString();
	}

	transient int end0; // Cache of start[0]
	transient int end1; // Cache of start[1]
	transient int end2; // Cache of start[2]
	transient int[] endBackref; // Lazy-alloced array of backref ends
	transient int[] endn; // Lazy-alloced array of sub-expression ends
	int matchFlags; // Match behaviour flags
	int maxParen = MAX_PAREN;
	// Parenthesized subexpressions
	transient int parenCount; // Number of subexpressions matched (num open
	// State of current program
	RegExProgram program; // Compiled regular expression 'program'

	transient CharacterIterator search; // The string being matched against
	// parens + 1)
	transient int start0; // Cache of start[0]

	transient int start1; // Cache of start[1]

	transient int start2; // Cache of start[2]

	// Backreferences
	transient int[] startBackref; // Lazy-alloced array of backref starts

	transient int[] startn; // Lazy-alloced array of sub-expression starts

	/**
	 * Constructs a regular expression matcher with no initial program. This is
	 * likely to be an uncommon practice, but is still supported.
	 */
	public RegEx() {
		this((RegExProgram) null, MATCH_NORMAL);
	}

	/**
	 * Construct a matcher for a pre-compiled regular expression from program
	 * (bytecode) data.
	 * 
	 * @param program
	 *            Compiled regular expression program
	 * @see RegExCompiler
	 */
	public RegEx(RegExProgram program) {
		this(program, MATCH_NORMAL);
	}

	/**
	 * Construct a matcher for a pre-compiled regular expression from program
	 * (bytecode) data. Permits special flags to be passed in to modify matching
	 * behaviour.
	 * 
	 * @param program
	 *            Compiled regular expression program (see RECompiler)
	 * @param matchFlags
	 *            One or more of the RE match behaviour flags (RE.MATCH_*):
	 * 
	 *            <pre>
	 *   MATCH_NORMAL              // Normal (case-sensitive) matching
	 *   MATCH_CASEINDEPENDENT     // Case folded comparisons
	 *   MATCH_MULTILINE           // Newline matches as BOL/EOL
	 * </pre>
	 * 
	 * @see RegExCompiler
	 * @see RegExProgram
	 */
	public RegEx(RegExProgram program, int matchFlags) {
		setProgram(program);
		setMatchFlags(matchFlags);
	}

	/**
	 * Constructs a regular expression matcher from a String by compiling it
	 * using a new instance of RECompiler. If you will be compiling many
	 * expressions, you may prefer to use a single RECompiler object instead.
	 * 
	 * @param pattern
	 *            The regular expression pattern to compile.
	 * @exception RegExException
	 *                Thrown if the regular expression has invalid syntax.
	 * @see RegExCompiler
	 */
	public RegEx(String pattern) throws RegExException {
		this(pattern, MATCH_NORMAL);
	}

	/**
	 * Constructs a regular expression matcher from a String by compiling it
	 * using a new instance of RECompiler. If you will be compiling many
	 * expressions, you may prefer to use a single RECompiler object instead.
	 * 
	 * @param pattern
	 *            The regular expression pattern to compile.
	 * @param matchFlags
	 *            The matching style
	 * @exception RegExException
	 *                Thrown if the regular expression has invalid syntax.
	 * @see RegExCompiler
	 */
	public RegEx(String pattern, int matchFlags) throws RegExException {
		this(new RegExCompiler().compile(pattern), matchFlags);
	}

	/**
	 * Performs lazy allocation of subexpression arrays
	 */
	private void allocParens() {
		// Allocate arrays for subexpressions
		startn = new int[maxParen];
		endn = new int[maxParen];

		// Set sub-expression pointers to invalid values
		for (int i = 0; i < maxParen; i++) {
			startn[i] = -1;
			endn[i] = -1;
		}
	}

	/**
	 * Compares two characters.
	 * 
	 * @param c1
	 *            first character to compare.
	 * @param c2
	 *            second character to compare.
	 * @param caseIndependent
	 *            whether comparision is case insensitive or not.
	 * @return negative, 0, or positive integer as the first character less
	 *         than, equal to, or greater then the second.
	 */
	private int compareChars(char c1, char c2, boolean caseIndependent) {
		if (caseIndependent) {
			c1 = Character.toLowerCase(c1);
			c2 = Character.toLowerCase(c2);
		}
		return ((int) c1 - (int) c2);
	}

	/**
	 * Returns the current match behaviour flags.
	 * 
	 * @return Current match behaviour flags (RE.MATCH_*).
	 * 
	 *         <pre>
	 *   MATCH_NORMAL              // Normal (case-sensitive) matching
	 *   MATCH_CASEINDEPENDENT     // Case folded comparisons
	 *   MATCH_MULTILINE           // Newline matches as BOL/EOL
	 * </pre>
	 * 
	 * @see #setMatchFlags
	 */
	public int getMatchFlags() {
		return matchFlags;
	}

	/**
	 * Gets the contents of a parenthesized subexpression after a successful
	 * match.
	 * 
	 * @param which
	 *            Nesting level of subexpression
	 * @return String
	 */
	public String getParen(int which) {
		int start;
		if (which < parenCount && (start = getParenStart(which)) >= 0) {
			return search.substring(start, getParenEnd(which));
		}
		return null;
	}

	/**
	 * Returns the number of parenthesized subexpressions available after a
	 * successful match.
	 * 
	 * @return Number of available parenthesized subexpressions
	 */
	public int getParenCount() {
		return parenCount;
	}

	/**
	 * Returns the end index of a given paren level.
	 * 
	 * @param which
	 *            Nesting level of subexpression
	 * @return String index
	 */
	public final int getParenEnd(int which) {
		if (which < parenCount) {
			switch (which) {
			case 0:
				return end0;

			case 1:
				return end1;

			case 2:
				return end2;

			default:
				if (endn == null) {
					allocParens();
				}
				return endn[which];
			}
		}
		return -1;
	}

	/**
	 * Returns the length of a given paren level.
	 * 
	 * @param which
	 *            Nesting level of subexpression
	 * @return Number of characters in the parenthesized subexpression
	 */
	public final int getParenLength(int which) {
		if (which < parenCount) {
			return getParenEnd(which) - getParenStart(which);
		}
		return -1;
	}

	/**
	 * Returns the start index of a given paren level.
	 * 
	 * @param which
	 *            Nesting level of subexpression
	 * @return String index
	 */
	public final int getParenStart(int which) {
		if (which < parenCount) {
			switch (which) {
			case 0:
				return start0;

			case 1:
				return start1;

			case 2:
				return start2;

			default:
				if (startn == null) {
					allocParens();
				}
				return startn[which];
			}
		}
		return -1;
	}

	/**
	 * Returns the current regular expression program in use by this matcher
	 * object.
	 * 
	 * @return Regular expression program
	 * @see #setProgram
	 */
	public RegExProgram getProgram() {
		return program;
	}

	/**
	 * Returns an array of Strings, whose toString representation matches a
	 * regular expression. This method works like the Perl function of the same
	 * name. Given a regular expression of "a*b" and an array of String objects
	 * of [foo, aab, zzz, aaaab], the array of Strings returned by grep would be
	 * [aab, aaaab].
	 * 
	 * @param search
	 *            Array of Objects to search
	 * @return Array of Strings whose toString() value matches this regular
	 *         expression.
	 */
	public String[] grep(Object[] search) {
		// Create new vector to hold return items
		Vector v = new Vector();

		// Traverse array of objects
		for (int i = 0; i < search.length; i++) {
			// Get next object as a string
			String s = search[i].toString();

			// If it matches this regexp, add it to the list
			if (match(s)) {
				v.addElement(s);
			}
		}

		// Return vector as an array of strings
		String[] ret = new String[v.size()];
		v.copyInto(ret);
		return ret;
	}

	/**
	 * Throws an Error representing an internal error condition probably
	 * resulting from a bug in the regular expression compiler (or possibly data
	 * corruption). In practice, this should be very rare.
	 * 
	 * @param s
	 *            Error description
	 */
	protected void internalError(String s) throws Error {
		throw new Error("RE internal error: " + s);
	}

	/**
	 * @return true if character at i-th position in the <code>search</code>
	 *         string is a newline
	 */
	private boolean isNewline(int i) {
		char nextChar = search.charAt(i);

		return nextChar == '\n' || nextChar == '\r' || nextChar == '\u0085' || nextChar == '\u2028'
				|| nextChar == '\u2029';
	}

	/**
	 * Matches the current regular expression program against a character array,
	 * starting at a given index.
	 * 
	 * @param search
	 *            String to match against
	 * @param i
	 *            Index to start searching at
	 * @return True if string matched
	 */
	public boolean match(CharacterIterator search, int i) {
		// There is no compiled program to search with!
		if (program == null) {
			// This should be uncommon enough to be an error case rather
			// than an exception (which would have to be handled everywhere)
			internalError("No RE program to run!");
		}

		// Save string to search
		this.search = search;

		// Can we optimize the search by looking for new lines?
		if ((program.flags & RegExProgram.OPT_HASBOL) == RegExProgram.OPT_HASBOL) {
			// Non multi-line matching with BOL: Must match at '0' index
			if ((matchFlags & MATCH_MULTILINE) == 0) {
				return i == 0 && matchAt(i);
			}

			// Multi-line matching with BOL: Seek to next line
			for (; !search.isEnd(i); i++) {
				// Skip if we are at the beginning of the line
				if (isNewline(i)) {
					continue;
				}

				// Match at the beginning of the line
				if (matchAt(i)) {
					return true;
				}

				// Skip to the end of line
				for (; !search.isEnd(i); i++) {
					if (isNewline(i)) {
						break;
					}
				}
			}

			return false;
		}

		// Can we optimize the search by looking for a prefix string?
		if (program.prefix == null) {
			// Unprefixed matching must try for a match at each character
			for (; !search.isEnd(i - 1); i++) {
				// Try a match at index i
				if (matchAt(i)) {
					return true;
				}
			}
			return false;
		} else {
			// Prefix-anchored matching is possible
			boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
			char[] prefix = program.prefix;
			for (; !search.isEnd(i + prefix.length - 1); i++) {
				int j = i;
				int k = 0;

				boolean match;
				do {
					// If there's a mismatch of any character in the prefix,
					// give up
					match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0);
				} while (match && k < prefix.length);

				// See if the whole prefix string matched
				if (k == prefix.length) {
					// We matched the full prefix at firstChar, so try it
					if (matchAt(i)) {
						return true;
					}
				}
			}
			return false;
		}
	}

	/**
	 * Matches the current regular expression program against a String.
	 * 
	 * @param search
	 *            String to match against
	 * @return True if string matched
	 */
	public boolean match(String search) {
		return match(search, 0);
	}

	/**
	 * Matches the current regular expression program against a character array,
	 * starting at a given index.
	 * 
	 * @param search
	 *            String to match against
	 * @param i
	 *            Index to start searching at
	 * @return True if string matched
	 */
	public boolean match(String search, int i) {
		return match(new StringCharacterIterator(search), i);
	}

	/**
	 * Match the current regular expression program against the current input
	 * string, starting at index i of the input string. This method is only
	 * meant for internal use.
	 * 
	 * @param i
	 *            The input string index to start matching at
	 * @return True if the input matched the expression
	 */
	protected boolean matchAt(int i) {
		// Initialize start pointer, paren cache and paren count
		start0 = -1;
		end0 = -1;
		start1 = -1;
		end1 = -1;
		start2 = -1;
		end2 = -1;
		startn = null;
		endn = null;
		parenCount = 1;
		setParenStart(0, i);

		// Allocate backref arrays (unless optimizations indicate otherwise)
		if ((program.flags & RegExProgram.OPT_HASBACKREFS) != 0) {
			startBackref = new int[maxParen];
			endBackref = new int[maxParen];
		}

		// Match against string
		int idx;
		if ((idx = matchNodes(0, maxNode, i)) != -1) {
			setParenEnd(0, idx);
			return true;
		}

		// Didn't match
		parenCount = 0;
		return false;
	}

	/**
	 * Try to match a string against a subset of nodes in the program
	 * 
	 * @param firstNode
	 *            Node to start at in program
	 * @param lastNode
	 *            Last valid node (used for matching a subexpression without
	 *            matching the rest of the program as well).
	 * @param idxStart
	 *            Starting position in character array
	 * @return Final input array index if match succeeded. -1 if not.
	 */
	protected int matchNodes(int firstNode, int lastNode, int idxStart) {
		// Our current place in the string
		int idx = idxStart;

		// Loop while node is valid
		int next, opcode, opdata;
		int idxNew;
		char[] instruction = program.instruction;
		for (int node = firstNode; node < lastNode;) {
			opcode = instruction[node /* + offsetOpcode */];
			next = node + (short) instruction[node + offsetNext];
			opdata = instruction[node + offsetOpdata];

			switch (opcode) {
			case OP_MAYBE:
			case OP_STAR: {
				// Try to match the following subexpr. If it matches:
				// MAYBE: Continues matching rest of the expression
				// STAR: Points back here to repeat subexpr matching
				if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1) {
					return idxNew;
				}

				// If failed, just continue with the rest of expression
				break;
			}

			case OP_PLUS: {
				// Try to match the subexpr again (and again (and ...
				if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
					return idxNew;
				}

				// If failed, just continue with the rest of expression
				// Rest is located at the next pointer of the next instruction
				// (which must be OP_CONTINUE)
				node = next + (short) instruction[next + offsetNext];
				continue;
			}

			case OP_RELUCTANTMAYBE:
			case OP_RELUCTANTSTAR: {
				// Try to match the rest without using the reluctant subexpr
				if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
					return idxNew;
				}

				// Try reluctant subexpr. If it matches:
				// RELUCTANTMAYBE: Continues matching rest of the expression
				// RELUCTANTSTAR: Points back here to repeat reluctant star
				// matching
				return matchNodes(node + nodeSize, next, idx);
			}

			case OP_RELUCTANTPLUS: {
				// Continue matching the rest without using the reluctant
				// subexpr
				if ((idxNew = matchNodes(next + (short) instruction[next + offsetNext], maxNode, idx)) != -1) {
					return idxNew;
				}

				// Try to match subexpression again
				break;
			}

			case OP_OPEN:

				// Match subexpression
				if ((program.flags & RegExProgram.OPT_HASBACKREFS) != 0) {
					startBackref[opdata] = idx;
				}
				if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
					// Increase valid paren count
					if (opdata >= parenCount) {
						parenCount = opdata + 1;
					}

					// Don't set paren if already set later on
					if (getParenStart(opdata) == -1) {
						setParenStart(opdata, idx);
					}
				}
				return idxNew;

			case OP_CLOSE:

				// Done matching subexpression
				if ((program.flags & RegExProgram.OPT_HASBACKREFS) != 0) {
					endBackref[opdata] = idx;
				}
				if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
					// Increase valid paren count
					if (opdata >= parenCount) {
						parenCount = opdata + 1;
					}

					// Don't set paren if already set later on
					if (getParenEnd(opdata) == -1) {
						setParenEnd(opdata, idx);
					}
				}
				return idxNew;

			case OP_BACKREF: {
				// Get the start and end of the backref
				int s = startBackref[opdata];
				int e = endBackref[opdata];

				// We don't know the backref yet
				if (s == -1 || e == -1) {
					return -1;
				}

				// The backref is empty size
				if (s == e) {
					break;
				}

				// Get the length of the backref
				int l = e - s;

				// If there's not enough input left, give up.
				if (search.isEnd(idx + l - 1)) {
					return -1;
				}

				// Case fold the backref?
				final boolean caseFold = ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
				// Compare backref to input
				for (int i = 0; i < l; i++) {
					if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0) {
						return -1;
					}
				}
			}
				break;

			case OP_BOL:

				// Fail if we're not at the start of the string
				if (idx != 0) {
					// If we're multiline matching, we could still be at the
					// start of a line
					if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) {
						// Continue if at the start of a line
						if (isNewline(idx - 1)) {
							break;
						}
					}
					return -1;
				}
				break;

			case OP_EOL:

				// If we're not at the end of string
				if (!search.isEnd(0) && !search.isEnd(idx)) {
					// If we're multi-line matching
					if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) {
						// Continue if we're at the end of a line
						if (isNewline(idx)) {
							break;
						}
					}
					return -1;
				}
				break;

			case OP_ESCAPE:

				// Which escape?
				switch (opdata) {
				// Word boundary match
				case E_NBOUND:
				case E_BOUND: {
					char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1));
					char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
					if ((CharUtil.isLetterOrDigit(cLast) == CharUtil.isLetterOrDigit(cNext)) == (opdata == E_BOUND)) {
						return -1;
					}
				}
					break;

				// Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
				case E_ALNUM:
				case E_NALNUM:
				case E_DIGIT:
				case E_NDIGIT:
				case E_SPACE:
				case E_NSPACE:

					// Give up if out of input
					if (search.isEnd(idx)) {
						return -1;
					}

					char c = search.charAt(idx);

					// Switch on escape
					switch (opdata) {
					case E_ALNUM:
					case E_NALNUM:
						if (!((CharUtil.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM))) {
							return -1;
						}
						break;

					case E_DIGIT:
					case E_NDIGIT:
						if (!(CharUtil.isDigit(c) == (opdata == E_DIGIT))) {
							return -1;
						}
						break;

					case E_SPACE:
					case E_NSPACE:
						if (!(CharUtil.isWhitespace(c) == (opdata == E_SPACE))) {
							return -1;
						}
						break;
					}
					idx++;
					break;

				default:
					internalError("Unrecognized escape '" + opdata + "'");
				}
				break;

			case OP_ANY:

				if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
					// Match anything
					if (search.isEnd(idx)) {
						return -1;
					}
				} else {
					// Match anything but a newline
					if (search.isEnd(idx) || isNewline(idx)) {
						return -1;
					}
				}
				idx++;
				break;

			case OP_ATOM: {
				// Match an atom value
				if (search.isEnd(idx)) {
					return -1;
				}

				// Get length of atom and starting index
				// int lenAtom = opdata;
				int startAtom = node + nodeSize;

				// Give up if not enough input remains to have a match
				if (search.isEnd(opdata + idx - 1)) {
					return -1;
				}

				// Match atom differently depending on casefolding flag
				final boolean caseFold = ((matchFlags & MATCH_CASEINDEPENDENT) != 0);

				for (int i = 0; i < opdata; i++) {
					if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0) {
						return -1;
					}
				}
			}
				break;

			case OP_POSIXCLASS: {
				// Out of input?
				if (search.isEnd(idx)) {
					return -1;
				}

				switch (opdata) {
				case POSIX_CLASS_ALNUM:
					if (!CharUtil.isLetterOrDigit(search.charAt(idx))) {
						return -1;
					}
					break;

				case POSIX_CLASS_ALPHA:
					if (!CharUtil.isLetter(search.charAt(idx))) {
						return -1;
					}
					break;

				case POSIX_CLASS_DIGIT:
					if (!CharUtil.isDigit(search.charAt(idx))) {
						return -1;
					}
					break;

				case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
					if (!CharUtil.isSpaceChar(search.charAt(idx))) {
						return -1;
					}
					break;

				case POSIX_CLASS_SPACE:
					if (!CharUtil.isWhitespace(search.charAt(idx))) {
						return -1;
					}
					break;

				case POSIX_CLASS_CNTRL:
					if (CharUtil.getType(search.charAt(idx)) != CharUtil.CONTROL) {
						return -1;
					}
					break;

				case POSIX_CLASS_GRAPH: // JWL - bugbug???
					switch (CharUtil.getType(search.charAt(idx))) {
					case CharUtil.MATH_SYMBOL:
					case CharUtil.CURRENCY_SYMBOL:
					case CharUtil.MODIFIER_SYMBOL:
					case CharUtil.OTHER_SYMBOL:
						break;

					default:
						return -1;
					}
					break;

				case POSIX_CLASS_LOWER:
					if (CharUtil.getType(search.charAt(idx)) != CharUtil.LOWERCASE_LETTER) {
						return -1;
					}
					break;

				case POSIX_CLASS_UPPER:
					if (CharUtil.getType(search.charAt(idx)) != CharUtil.UPPERCASE_LETTER) {
						return -1;
					}
					break;

				case POSIX_CLASS_PRINT:
					if (CharUtil.getType(search.charAt(idx)) == CharUtil.CONTROL) {
						return -1;
					}
					break;

				case POSIX_CLASS_PUNCT: {
					int type = CharUtil.getType(search.charAt(idx));
					switch (type) {
					case CharUtil.DASH_PUNCTUATION:
					case CharUtil.START_PUNCTUATION:
					case CharUtil.END_PUNCTUATION:
					case CharUtil.CONNECTOR_PUNCTUATION:
					case CharUtil.OTHER_PUNCTUATION:
						break;

					default:
						return -1;
					}
				}
					break;

				case POSIX_CLASS_XDIGIT: // JWL - bugbug??
				{
					boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9')
							|| (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') || (search.charAt(idx) >= 'A' && search
							.charAt(idx) <= 'F'));
					if (!isXDigit) {
						return -1;
					}
				}
					break;

				case POSIX_CLASS_JSTART:
					if (!CharUtil.isJavaIdentifierStart(search.charAt(idx))) {
						return -1;
					}
					break;

				case POSIX_CLASS_JPART:
					if (!CharUtil.isJavaIdentifierPart(search.charAt(idx))) {
						return -1;
					}
					break;

				default:
					internalError("Bad posix class");
					break;
				}

				// Matched.
				idx++;
			}
				break;

			case OP_ANYOF: {
				// Out of input?
				if (search.isEnd(idx)) {
					return -1;
				}

				// Get character to match against character class and maybe
				// casefold
				char c = search.charAt(idx);
				boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
				// Loop through character class checking our match character
				int idxRange = node + nodeSize;
				int idxEnd = idxRange + (opdata * 2);
				boolean match = false;
				for (int i = idxRange; !match && i < idxEnd;) {
					// Get start, end and match characters
					char s = instruction[i++];
					char e = instruction[i++];

					match = ((compareChars(c, s, caseFold) >= 0) && (compareChars(c, e, caseFold) <= 0));
				}

				// Fail if we didn't match the character class
				if (!match) {
					return -1;
				}
				idx++;
			}
				break;

			case OP_BRANCH: {
				// Check for choices
				// FIXME Dead code - only reason to keep is backward compat with
				// pre-compiled exprs. Remove?
				if (instruction[next /* + offsetOpcode */] != OP_BRANCH) {
					// If there aren't any other choices, just evaluate this
					// branch.
					node += nodeSize;
					continue;
				}

				// Try all available branches
				int nextBranch;
				do {
					// Try matching the branch against the string
					if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1) {
						return idxNew;
					}

					// Go to next branch (if any)
					nextBranch = (short) instruction[node + offsetNext];
					node += nextBranch;
				} while (nextBranch != 0 && (instruction[node /* + offsetOpcode */] == OP_BRANCH));

				// Failed to match any branch!
				return -1;
			}

			case OP_OPEN_CLUSTER:
			case OP_CLOSE_CLUSTER:
				// starting or ending the matching of a subexpression which has
				// no backref.

			case OP_NOTHING:
			case OP_GOTO:

				// Just advance to the next node without doing anything
				break;

			case OP_CONTINUE:

				// Advance to the following node
				node += nodeSize;
				continue;

			case OP_END:

				// Match has succeeded!
				setParenEnd(0, idx);
				return idx;

			default:

				// Corrupt program
				internalError("Invalid opcode '" + opcode + "'");
			}

			// Advance to the next node in the program
			node = next;
		}

		// We "should" never end up here
		internalError("Corrupt program");
		return -1;
	}

	/**
	 * Sets match behaviour flags which alter the way RE does matching.
	 * 
	 * @param matchFlags
	 *            One or more of the RE match behaviour flags (RE.MATCH_*):
	 * 
	 *            <pre>
	 *   MATCH_NORMAL              // Normal (case-sensitive) matching
	 *   MATCH_CASEINDEPENDENT     // Case folded comparisons
	 *   MATCH_MULTILINE           // Newline matches as BOL/EOL
	 * </pre>
	 */
	public void setMatchFlags(int matchFlags) {
		this.matchFlags = matchFlags;
	}

	/**
	 * Sets the end of a paren level
	 * 
	 * @param which
	 *            Which paren level
	 * @param i
	 *            Index in input array
	 */
	protected final void setParenEnd(int which, int i) {
		if (which < parenCount) {
			switch (which) {
			case 0:
				end0 = i;
				break;

			case 1:
				end1 = i;
				break;

			case 2:
				end2 = i;
				break;

			default:
				if (endn == null) {
					allocParens();
				}
				endn[which] = i;
				break;
			}
		}
	}

	/**
	 * Sets the start of a paren level
	 * 
	 * @param which
	 *            Which paren level
	 * @param i
	 *            Index in input array
	 */
	protected final void setParenStart(int which, int i) {
		if (which < parenCount) {
			switch (which) {
			case 0:
				start0 = i;
				break;

			case 1:
				start1 = i;
				break;

			case 2:
				start2 = i;
				break;

			default:
				if (startn == null) {
					allocParens();
				}
				startn[which] = i;
				break;
			}
		}
	}

	/**
	 * Sets the current regular expression program used by this matcher object.
	 * 
	 * @param program
	 *            Regular expression program compiled by RECompiler.
	 * @see RegExCompiler
	 * @see RegExProgram
	 */
	public void setProgram(RegExProgram program) {
		this.program = program;
		if (program != null && program.maxParens != -1) {
			this.maxParen = program.maxParens;
		} else {
			this.maxParen = MAX_PAREN;
		}
	}

	/**
	 * Splits a string into an array of strings on regular expression
	 * boundaries. This function works the same way as the Perl function of the
	 * same name. Given a regular expression of "[ab]+" and a string to split of
	 * "xyzzyababbayyzabbbab123", the result would be the array of Strings
	 * "[xyzzy, yyz, 123]".
	 * 
	 * <p>
	 * Please note that the first string in the resulting array may be an empty
	 * string. This happens when the very first character of input string is
	 * matched by the pattern.
	 * 
	 * @param s
	 *            String to split on this regular exression
	 * @return Array of strings
	 */
	public String[] split(String s) {
		// Create new vector
		Vector v = new Vector();

		// Start at position 0 and search the whole string
		int pos = 0;
		int len = s.length();

		// Try a match at each position
		while (pos < len && match(s, pos)) {
			// Get start of match
			int start = getParenStart(0);

			// Get end of match
			int newpos = getParenEnd(0);

			// Check if no progress was made
			if (newpos == pos) {
				v.addElement(s.substring(pos, start + 1));
				newpos++;
			} else {
				v.addElement(s.substring(pos, start));
			}

			// Move to new position
			pos = newpos;
		}

		// Push remainder if it's not empty
		String remainder = s.substring(pos);
		if (remainder.length() != 0) {
			v.addElement(remainder);
		}

		// Return vector as an array of strings
		String[] ret = new String[v.size()];
		v.copyInto(ret);
		return ret;
	}

	/**
	 * Substitutes a string for this regular expression in another string. This
	 * method works like the Perl function of the same name. Given a regular
	 * expression of "a*b", a String to substituteIn of
	 * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
	 * resulting String returned by subst would be "-foo-garply-wacky-".
	 * 
	 * @param substituteIn
	 *            String to substitute within
	 * @param substitution
	 *            String to substitute for all matches of this regular
	 *            expression.
	 * @return The string substituteIn with zero or more occurrences of the
	 *         current regular expression replaced with the substitution String
	 *         (if this regular expression object doesn't match at any position,
	 *         the original String is returned unchanged).
	 */
	public String subst(String substituteIn, String substitution) {
		return subst(substituteIn, substitution, REPLACE_ALL);
	}

	/**
	 * Substitutes a string for this regular expression in another string. This
	 * method works like the Perl function of the same name. Given a regular
	 * expression of "a*b", a String to substituteIn of
	 * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
	 * resulting String returned by subst would be "-foo-garply-wacky-".
	 * <p>
	 * It is also possible to reference the contents of a parenthesized
	 * expression with $0, $1, ... $9. A regular expression of
	 * "http://[\\.\\w\\-\\?/~_@&=%]+", a String to substituteIn of
	 * "visit us: http://www.apache.org!" and the substitution String
	 * "&lt;a href=\"$0\"&gt;$0&lt;/a&gt;", the resulting String returned by
	 * subst would be"visit us: &lt;a href=\"http://www.apache.org\"&gt;http://www.apache.org&lt;/a&gt;!"
	 * .
	 * <p>
	 * <i>Note:</i> $0 represents the whole match.
	 * 
	 * @param substituteIn
	 *            String to substitute within
	 * @param substitution
	 *            String to substitute for matches of this regular expression
	 * @param flags
	 *            One or more bitwise flags from REPLACE_*. If the
	 *            REPLACE_FIRSTONLY flag bit is set, only the first occurrence
	 *            of this regular expression is replaced. If the bit is not set
	 *            (REPLACE_ALL), all occurrences of this pattern will be
	 *            replaced. If the flag REPLACE_BACKREFERENCES is set, all
	 *            backreferences will be processed.
	 * @return The string substituteIn with zero or more occurrences of the
	 *         current regular expression replaced with the substitution String
	 *         (if this regular expression object doesn't match at any position,
	 *         the original String is returned unchanged).
	 */
	public String subst(String substituteIn, String substitution, int flags) {
		// String to return
		StringBuffer ret = new StringBuffer();

		// Start at position 0 and search the whole string
		int pos = 0;
		int len = substituteIn.length();

		// Try a match at each position
		while (pos < len && match(substituteIn, pos)) {
			// Append string before match
			ret.append(substituteIn.substring(pos, getParenStart(0)));

			if ((flags & REPLACE_BACKREFERENCES) != 0) {
				// Process backreferences
				int lCurrentPosition = 0;
				int lLastPosition = -2;
				int lLength = substitution.length();

				while ((lCurrentPosition = substitution.indexOf("$", lCurrentPosition)) >= 0) {
					if ((lCurrentPosition == 0 || substitution.charAt(lCurrentPosition - 1) != '\\')
							&& lCurrentPosition + 1 < lLength) {
						char c = substitution.charAt(lCurrentPosition + 1);
						if (c >= '0' && c <= '9') {
							// Append everything between the last and the
							// current $ sign
							ret.append(substitution.substring(lLastPosition + 2, lCurrentPosition));

							// Append the parenthesized expression, if present
							String val = getParen(c - '0');
							if (val != null) {
								ret.append(val);
							}
							lLastPosition = lCurrentPosition;
						}
					}

					// Move forward, skipping past match
					lCurrentPosition++;
				}

				// Append everything after the last $ sign
				ret.append(substitution.substring(lLastPosition + 2, lLength));
			} else {
				// Append substitution without processing backreferences
				ret.append(substitution);
			}

			// Move forward, skipping past match
			int newpos = getParenEnd(0);

			// We always want to make progress!
			if (newpos == pos) {
				newpos++;
			}

			// Try new position
			pos = newpos;

			// Break out if we're only supposed to replace one occurrence
			if ((flags & REPLACE_FIRSTONLY) != 0) {
				break;
			}
		}

		// If there's remaining input, append it
		if (pos < len) {
			ret.append(substituteIn.substring(pos));
		}

		// Return string buffer as string
		return ret.toString();
	}
}
